My PhD in Weeks
This is your PhD, and it's ending one week at a time.
[Table: a week-by-week grid of the PhD, with one column per year (1-4) and one row per week (1-52); an x marks each week already elapsed.]
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import numpy as np
import keras.backend as K
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import logistic, multivariate_normal, norm
from scipy.special import expit
from keras.models import Model, Sequential
from keras.layers import Activation, Dense, Dot, Input
from keras.utils.vis_utils import model_to_dot
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.animation import FuncAnimation
from IPython.display import SVG
plt.style.use('seaborn-notebook')
# display animation inline
plt.rc('animation', html='html5')
sns.set_context('notebook')
np.set_printoptions(precision=2,
edgeitems=3,
linewidth=80,
suppress=True)
K.tf.__version__
LATENT_DIM = 2
NOISE_DIM = 3
BATCH_SIZE = 128
PRIOR_VARIANCE = 2.
w_min, w_max = -5, 5
w1, w2 = np.mgrid[w_min:w_max:300j, w_min:w_max:300j]
w_grid = np.dstack((w1, w2))
w_grid.shape
prior = multivariate_normal(mean=np.zeros(LATENT_DIM),
cov=PRIOR_VARIANCE)
log_prior = prior.logpdf(w_grid)
log_prior.shape
fig, ax = plt.subplots(figsize=(5, 5))
ax.contourf(w1, w2, log_prior, cmap='magma')
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_xlim(w_min, w_max)
ax.set_ylim(w_min, w_max)
plt.show()
x1 = np.array([ 1.5, 1.])
x2 = np.array([-1.5, 1.])
x3 = np.array([- .5, -1.])
X = np.vstack((x1, x2, x3))
X.shape
y1 = 1
y2 = 1
y3 = 0
y = np.stack((y1, y2, y3))
y.shape
def log_likelihood(w, x, y):
    # equiv. to negative binary cross-entropy
    return np.log(expit(np.dot(w.T, x)*(-1)**(1-y)))
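The `(-1)**(1-y)` trick works because $(-1)^{1-y} = 2y-1$ for $y \in \{0,1\}$, and $1-\sigma(a) = \sigma(-a)$, so the Bernoulli log-likelihood collapses into a single logistic term:
$$\log p(y \mid x, w) = y\log\sigma(w^{\top}x) + (1-y)\log\big(1-\sigma(w^{\top}x)\big) = \log\sigma\big((2y-1)\,w^{\top}x\big).$$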
llhs = log_likelihood(w_grid.T, X.T, y)
llhs.shape
fig, axes = plt.subplots(ncols=3, nrows=1, figsize=(6, 2))
fig.tight_layout()
for i, ax in enumerate(axes):
    ax.contourf(w1, w2, llhs[::, ::, i], cmap=plt.cm.magma)
    ax.set_xlim(w_min, w_max)
    ax.set_ylim(w_min, w_max)
    ax.set_title(r'$p(y_{{{0}}} \mid x_{{{0}}}, w)$'.format(i+1))
    ax.set_xlabel('$w_1$')
    if not i:
        ax.set_ylabel('$w_2$')
plt.show()
fig, ax = plt.subplots(figsize=(5, 5))
ax.contourf(w1, w2, np.sum(llhs, axis=2),
cmap=plt.cm.magma)
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_xlim(w_min, w_max)
ax.set_ylim(w_min, w_max)
plt.show()
fig, ax = plt.subplots(figsize=(5, 5))
ax.contourf(w1, w2,
np.exp(log_prior+np.sum(llhs, axis=2)),
cmap='magma')
ax.scatter(*X.T, c=y, cmap='coolwarm', marker=',')
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_xlim(w_min, w_max)
ax.set_ylim(w_min, w_max)
plt.show()
In the general adversarial setup, the discriminator is a function $T_{\psi}(x, z)$ of both an observation and a latent code. Here the only latent quantity is the weight vector $w$ itself, so we consider a discriminator $T_{\psi}(w)$, a map $T_{\psi} : \mathbb{R}^2 \to \mathbb{R}$.
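The discriminator will be trained with the logistic loss to tell prior samples (labelled 0) apart from approximate-posterior samples (labelled 1). The standard density-ratio result is what makes this useful: at the optimum, the pre-sigmoid logit recovers the log density ratio, and hence an estimator of the otherwise intractable KL term,
$$T_{\psi}^{*}(w) = \log\frac{q_{\phi}(w)}{p(w)}, \qquad \mathrm{KL}\big(q_{\phi}(w) \,\|\, p(w)\big) = \mathbb{E}_{q_{\phi}(w)}\big[T_{\psi}^{*}(w)\big].$$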
discriminator = Sequential(name='discriminator')
discriminator.add(Dense(10, input_dim=LATENT_DIM, activation='relu'))
discriminator.add(Dense(20, activation='relu'))
discriminator.add(Dense(1, activation=None, name='logit'))
discriminator.add(Activation('sigmoid'))
discriminator.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['binary_accuracy'])
ratio_estimator = Model(
inputs=discriminator.inputs,
outputs=discriminator.get_layer(name='logit').output)
SVG(model_to_dot(discriminator, show_shapes=True)
.create(prog='dot', format='svg'))
w_grid_ratio = ratio_estimator.predict(w_grid.reshape(300*300, 2))
w_grid_ratio = w_grid_ratio.reshape(300, 300)
The initial density-ratio estimate, before any training:
fig, ax = plt.subplots(figsize=(5, 5))
ax.contourf(w1, w2, w_grid_ratio, cmap=plt.cm.magma)
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_xlim(w_min, w_max)
ax.set_ylim(w_min, w_max)
plt.show()
discriminator.evaluate(prior.rvs(size=5), np.zeros(5))
Likewise, the inference network is in general a function $z_{\phi}(x, \epsilon)$ of an observation and a noise source. Here we only consider $z_{\phi}(\epsilon)$, a map $z_{\phi} : \mathbb{R}^3 \to \mathbb{R}^2$ that transforms noise into a draw of the weights.
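Pushing the noise through $z_{\phi}$ defines the variational distribution only implicitly: we can sample from it, but not evaluate its density, which is exactly why the density-ratio estimator above is needed:
$$\epsilon \sim \mathcal{N}(0, I_3), \quad w = z_{\phi}(\epsilon) \;\; \Longrightarrow \;\; w \sim q_{\phi}(w).$$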
inference = Sequential()
inference.add(Dense(10, input_dim=NOISE_DIM, activation='relu'))
inference.add(Dense(20, activation='relu'))
inference.add(Dense(LATENT_DIM, activation=None))
inference.summary()
The variational parameters $\phi$ are the trainable weights of the approximate inference model:
phi = inference.trainable_weights
phi
SVG(model_to_dot(inference, show_shapes=True)
.create(prog='dot', format='svg'))
w_sample_prior = prior.rvs(size=BATCH_SIZE)
w_sample_prior.shape
eps = np.random.randn(BATCH_SIZE, NOISE_DIM)
w_sample_posterior = inference.predict(eps)
w_sample_posterior.shape
inputs = np.vstack((w_sample_prior, w_sample_posterior))
targets = np.hstack((np.zeros(BATCH_SIZE), np.ones(BATCH_SIZE)))
fig, ax = plt.subplots(figsize=(5, 5))
ax.contourf(w1, w2,
np.exp(log_prior+np.sum(llhs, axis=2)),
cmap=plt.cm.magma)
ax.scatter(*inputs.T, c=targets, alpha=.8, cmap='coolwarm')
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_xlim(w_min, w_max)
ax.set_ylim(w_min, w_max)
plt.show()
metrics = discriminator.evaluate(inputs, targets)
w_grid_ratio = ratio_estimator.predict(w_grid.reshape(300*300, 2))
w_grid_ratio = w_grid_ratio.reshape(300, 300)
fig, ax = plt.subplots(figsize=(5, 5))
ax.contourf(w1, w2, w_grid_ratio, cmap='magma')
ax.scatter(*inputs.T, c=targets, alpha=.8, cmap='coolwarm')
train_info = dict(zip(discriminator.metrics_names, metrics))
props = dict(boxstyle='round', facecolor='w', alpha=0.5)
ax.text(0.05, 0.05,
('accuracy: {binary_accuracy:.2f}\n'
'loss: {loss:.2f}').format(**train_info),
transform=ax.transAxes, bbox=props)
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_xlim(w_min, w_max)
ax.set_ylim(w_min, w_max)
plt.show()
def train_animate(epoch_num, batch_size=128, steps_per_epoch=20):
    for step in range(steps_per_epoch):
        w_sample_prior = prior.rvs(size=batch_size)
        eps = np.random.randn(batch_size, NOISE_DIM)
        w_sample_posterior = inference.predict(eps)
        inputs = np.vstack((w_sample_prior, w_sample_posterior))
        targets = np.hstack((np.zeros(batch_size), np.ones(batch_size)))
        metrics = discriminator.train_on_batch(inputs, targets)
    ax.cla()
    w_grid_ratio = ratio_estimator.predict(w_grid.reshape(300*300, 2))
    w_grid_ratio = w_grid_ratio.reshape(300, 300)
    ax.contourf(w1, w2, w_grid_ratio, cmap='magma')
    ax.scatter(*inputs.T, c=targets, alpha=.8, cmap='coolwarm')
    train_info = dict(zip(discriminator.metrics_names, metrics))
    train_info['epoch'] = epoch_num
    props = dict(boxstyle='round', facecolor='w', alpha=0.5)
    ax.text(0.05, 0.05,
            ('epoch: {epoch:2d}\n'
             'accuracy: {binary_accuracy:.2f}\n'
             'loss: {loss:.2f}').format(**train_info),
            transform=ax.transAxes, bbox=props)
    ax.set_xlabel('$w_1$')
    ax.set_ylabel('$w_2$')
    ax.set_xlim(w_min, w_max)
    ax.set_ylim(w_min, w_max)
    return ax
FuncAnimation(fig, train_animate, frames=50,
interval=200, # 5 fps
blit=False)
inputs = np.vstack((w_sample_prior, w_sample_posterior))
targets = np.hstack((np.zeros(BATCH_SIZE), np.ones(BATCH_SIZE)))
metrics = discriminator.evaluate(inputs, targets)
w_grid_ratio = ratio_estimator.predict(w_grid.reshape(300*300, 2))
w_grid_ratio = w_grid_ratio.reshape(300, 300)
fig, ax = plt.subplots(figsize=(5, 5))
ax.contourf(w1, w2, w_grid_ratio, cmap='magma')
ax.scatter(*inputs.T, c=targets, alpha=.8, cmap='coolwarm')
train_info = dict(zip(discriminator.metrics_names, metrics))
props = dict(boxstyle='round', facecolor='w', alpha=0.5)
ax.text(0.05, 0.05,
('accuracy: {binary_accuracy:.2f}\n'
'loss: {loss:.2f}').format(**train_info),
transform=ax.transAxes, bbox=props)
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_xlim(w_min, w_max)
ax.set_ylim(w_min, w_max)
plt.show()
def set_trainable(model, trainable):
    """Recursively set the trainable flag on a model and all its layers."""
    model.trainable = trainable
    if isinstance(model, Model):  # i.e. has layers
        for layer in model.layers:
            set_trainable(layer, trainable)
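One caveat on this helper: in Keras the `trainable` flag is only consulted when a model is compiled, so the freeze has to happen before the relevant `compile` call; flipping the flag between `train_on_batch` calls has no effect on an already-compiled model. A minimal, self-contained sketch of the pattern (hypothetical toy model, not part of this notebook's pipeline):
# toy example: freeze a layer, then compile
net = Sequential([Dense(8, input_dim=2, name='frozen_layer'),
                  Dense(1)])
set_trainable(net.get_layer('frozen_layer'), False)
net.compile(loss='mse', optimizer='adam')
# net.train_on_batch(...) now leaves 'frozen_layer' untouched;
# changing the flag afterwards requires re-compiling to take effect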
y_pred = K.sigmoid(K.dot(
K.constant(w_grid),
K.transpose(K.constant(X))))
y_pred
y_true = K.ones((300, 300, 1))*K.constant(y)
y_true
llhs_keras = - K.binary_crossentropy(
y_pred,
y_true,
from_logits=False)
sess = K.get_session()
np.allclose(np.sum(llhs, axis=-1),
sess.run(K.sum(llhs_keras, axis=-1)))
fig, ax = plt.subplots(figsize=(5, 5))
ax.contourf(w1, w2, sess.run(K.sum(llhs_keras, axis=-1)),
cmap=plt.cm.magma)
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_xlim(w_min, w_max)
ax.set_ylim(w_min, w_max)
plt.show()
def make_elbo(ratio_estimator):
    # freeze the ratio estimator so that optimizing the ELBO
    # only ever updates the variational parameters phi
    set_trainable(ratio_estimator, False)
    def elbo(y_true, w_sample):
        # T_psi(w) estimates log q(w) - log p(w); its mean under q
        # is a single-sample estimate of the KL term
        kl_estimate = ratio_estimator(w_sample)
        y_pred = K.dot(w_sample, K.transpose(K.constant(X)))
        log_likelihood = - K.binary_crossentropy(y_pred, y_true,
                                                 from_logits=True)
        return K.mean(log_likelihood - kl_estimate, axis=-1)
    return elbo
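Up to the single-sample Monte Carlo estimate of the KL term (and a mean rather than a sum over the $N=3$ data points), `elbo` computes the adversarially-estimated evidence lower bound
$$\mathcal{L}(\phi) = \mathbb{E}_{\epsilon}\Big[\frac{1}{N}\sum_{n=1}^{N}\log p\big(y_n \mid x_n, z_{\phi}(\epsilon)\big) - T_{\psi}\big(z_{\phi}(\epsilon)\big)\Big],$$
where $\mathbb{E}_{q_{\phi}(w)}[T_{\psi}(w)]$ stands in for $\mathrm{KL}(q_{\phi}(w) \,\|\, p(w))$. This is a quantity to be maximized, hence the sign flip when it is wired into Keras as a loss below.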
elbo = make_elbo(ratio_estimator)
fig, ax = plt.subplots(figsize=(5, 5))
ax.contourf(w1, w2, sess.run(elbo(y_true, K.constant(w_grid))),
cmap=plt.cm.magma)
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_xlim(w_min, w_max)
ax.set_ylim(w_min, w_max)
plt.show()
# Keras minimizes the loss, so we wire in the *negative* ELBO
inference_loss = lambda y_true, w_sample: -elbo(y_true, w_sample)
inference.compile(loss=inference_loss,
optimizer='adam')
eps = np.random.randn(BATCH_SIZE, NOISE_DIM)
y_true = K.repeat_elements(K.expand_dims(K.constant(y), axis=0),
axis=0, rep=BATCH_SIZE)
y_true
sess.run(K.mean(elbo(y_true, inference(K.constant(eps))), axis=-1))
inference.evaluate(eps, np.tile(y, reps=(BATCH_SIZE, 1)))
for epoch in range(3*200):
    # inference (generator) step: maximize the ELBO wrt. phi
    set_trainable(ratio_estimator, False)
    for _ in range(1):
        eps = np.random.randn(BATCH_SIZE, NOISE_DIM)
        metrics_inference = inference.train_on_batch(
            eps, np.tile(y, reps=(BATCH_SIZE, 1)))
    # discriminator step: re-fit the density-ratio estimator
    set_trainable(discriminator, True)
    for _ in range(3*50):
        w_sample_prior = prior.rvs(size=BATCH_SIZE)
        eps = np.random.randn(BATCH_SIZE, NOISE_DIM)
        w_sample_posterior = inference.predict(eps)
        inputs = np.vstack((w_sample_prior, w_sample_posterior))
        targets = np.hstack((np.zeros(BATCH_SIZE), np.ones(BATCH_SIZE)))
        metrics_discrim = discriminator.train_on_batch(inputs, targets)
inputs = np.vstack((w_sample_prior, w_sample_posterior))
targets = np.hstack((np.zeros(BATCH_SIZE), np.ones(BATCH_SIZE)))
metrics = discriminator.evaluate(inputs, targets)
w_grid_ratio = ratio_estimator.predict(w_grid.reshape(300*300, 2))
w_grid_ratio = w_grid_ratio.reshape(300, 300)
fig, ax = plt.subplots(figsize=(5, 5))
ax.contourf(w1, w2, w_grid_ratio, cmap='magma')
ax.scatter(*inputs.T, c=targets, alpha=.8, cmap='coolwarm')
train_info = dict(zip(discriminator.metrics_names, metrics))
props = dict(boxstyle='round', facecolor='w', alpha=0.5)
ax.text(0.05, 0.05,
('accuracy: {binary_accuracy:.2f}\n'
'loss: {loss:.2f}').format(**train_info),
transform=ax.transAxes, bbox=props)
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_xlim(w_min, w_max)
ax.set_ylim(w_min, w_max)
plt.show()
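The loop above alternates the two players by hand. As an alternative, we can wrap the sample construction in ordinary Python generators and let Keras drive the discriminator updates through `fit_generator`: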
def prior_samples_gen(batch_size):
    while True:
        yield prior.rvs(size=batch_size)
def posterior_samples_gen(inference_model, batch_size):
    while True:
        eps = np.random.randn(batch_size, NOISE_DIM)
        yield inference_model.predict(eps)
def discriminator_data_gen(inference_model, batch_size):
    for samples_prior, samples_posterior in zip(
            prior_samples_gen(batch_size),
            posterior_samples_gen(inference_model, batch_size)):
        inputs = np.vstack((samples_prior, samples_posterior))
        targets = np.hstack((np.zeros(batch_size), np.ones(batch_size)))
        yield inputs, targets
h = discriminator.fit_generator(
    generator=discriminator_data_gen(inference, 128),
    steps_per_epoch=32, epochs=2)
h.history['loss'][-1]
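Finally, the whole scheme can also be written directly against the TensorFlow backend: the prior and noise samples become symbolic random tensors, the two objectives are built as tensors, and each player gets its own explicit training op.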
sess = K.get_session()
w_prior_samples = K.random_normal(shape=(BATCH_SIZE, LATENT_DIM),
stddev=np.sqrt(PRIOR_VARIANCE))
eps = K.random_normal(shape=(BATCH_SIZE, NOISE_DIM))
w_posterior_samples = inference(eps)
w_posterior_samples
fig, ax = plt.subplots(figsize=(5, 5))
w_grid_ratio = ratio_estimator.predict(w_grid.reshape(300*300, 2))
w_grid_ratio = w_grid_ratio.reshape(300, 300)
ax.contourf(w1, w2, w_grid_ratio, cmap=plt.cm.magma)
ax.scatter(*sess.run(w_posterior_samples).T, alpha=.6)
ax.scatter(*sess.run(w_prior_samples).T, alpha=.6)
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_xlim(w_min, w_max)
ax.set_ylim(w_min, w_max)
plt.show()
discrim_loss = K.mean(
    K.binary_crossentropy(
        discriminator(w_posterior_samples),
        K.ones_like(discriminator(w_posterior_samples))) +
    K.binary_crossentropy(
        discriminator(w_prior_samples),
        K.zeros_like(discriminator(w_prior_samples))))
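This is the usual logistic discriminator loss, with posterior samples labelled 1 and prior samples labelled 0:
$$\mathcal{L}_{D}(\psi) = -\,\mathbb{E}_{q_{\phi}(w)}\big[\log D_{\psi}(w)\big] - \mathbb{E}_{p(w)}\big[\log\big(1 - D_{\psi}(w)\big)\big], \qquad D_{\psi}(w) = \sigma\big(T_{\psi}(w)\big).$$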
discrim_loss.eval(session=sess)
opt = K.tf.train.AdamOptimizer(3e-3, beta1=0.9)
discrim_train_op = opt.minimize(discrim_loss,
var_list=discriminator.trainable_weights)
K.mean(ratio_estimator(w_posterior_samples))
K.expand_dims(K.constant(y), 1)
K.pow(K.constant(-1), 1-K.expand_dims(K.constant(y), 1))
K.dot(K.constant(X), K.transpose(w_posterior_samples))
K.dot(K.constant(X), K.transpose(w_posterior_samples))*K.pow(K.constant(-1), 1-K.expand_dims(K.constant(y), 1))
K.sigmoid(K.dot(K.constant(X), K.transpose(w_posterior_samples)) *
K.pow(K.constant(-1), 1-K.expand_dims(K.constant(y), 1)))
K.log(K.sigmoid(K.dot(K.constant(X), K.transpose(w_posterior_samples)) *
K.pow(K.constant(-1), 1-K.expand_dims(K.constant(y), 1))))
K.mean(K.log(K.sigmoid(K.dot(K.constant(X), K.transpose(w_posterior_samples)) *
K.pow(K.constant(-1), 1-K.expand_dims(K.constant(y), 1)))))
log_likelihood = K.mean(K.log(K.sigmoid(K.dot(K.constant(X), K.transpose(w_posterior_samples)) *
K.pow(K.constant(-1), 1-K.expand_dims(K.constant(y), 1)))))
log_likelihood
inference_loss = K.mean(ratio_estimator(w_posterior_samples)) - log_likelihood
inference_loss.eval(session=sess)
inference_train_op = opt.minimize(inference_loss,
var_list=inference.trainable_weights)
keras_llh = K.reshape(K.sum(K.log(K.sigmoid(K.dot(K.constant(X), K.reshape(K.permute_dimensions(K.constant(w_grid), (2, 0, 1)), shape=(2, 300*300))) *
K.pow(K.constant(-1), 1-K.expand_dims(K.constant(y), 1)))),
axis=0),
shape=(300, 300))
keras_llh
fig, ax = plt.subplots(figsize=(6, 5))
c = ax.contourf(w1, w2, sess.run(keras_llh), cmap=plt.cm.magma)
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_xlim(w_min, w_max)
ax.set_ylim(w_min, w_max)
plt.colorbar(c)
plt.show()
np.allclose(sess.run(keras_llh),
np.sum(llhs, axis=2))
sess = K.get_session()
for d_step in range(3*300):
    loss, _ = sess.run([discrim_loss, discrim_train_op])
print(loss)
fig, ax = plt.subplots(figsize=(5, 5))
w_grid_ratio = ratio_estimator.predict(w_grid.reshape(300*300, 2))
w_grid_ratio = w_grid_ratio.reshape(300, 300)
ax.contourf(w1, w2, w_grid_ratio, cmap=plt.cm.magma)
ax.scatter(*sess.run(w_posterior_samples).T, alpha=.6)
ax.scatter(*sess.run(w_prior_samples).T, alpha=.6)
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_xlim(w_min, w_max)
ax.set_ylim(w_min, w_max)
plt.show()
for step in range(3*200):
    g_loss, _ = sess.run([inference_loss, inference_train_op])
    for d_step in range(3*50):
        d_loss, _ = sess.run([discrim_loss, discrim_train_op])
print(d_loss, g_loss)
fig, ax = plt.subplots(figsize=(5, 5))
w_grid_ratio = ratio_estimator.predict(w_grid.reshape(300*300, 2))
w_grid_ratio = w_grid_ratio.reshape(300, 300)
ax.contourf(w1, w2, w_grid_ratio, cmap=plt.cm.magma)
ax.scatter(*sess.run(w_posterior_samples).T, alpha=.8)
ax.scatter(*sess.run(w_prior_samples).T, alpha=.8)
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_xlim(w_min, w_max)
ax.set_ylim(w_min, w_max)
plt.show()
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(9, 4))
sns.kdeplot(*sess.run(w_posterior_samples).T, cmap='magma', ax=ax2)
ax2.set_xlim(w_min, w_max)
ax2.set_ylim(w_min, w_max)
ax1.contourf(w1, w2,
np.exp(log_prior+np.sum(llhs, axis=2)),
cmap=plt.cm.magma)
ax1.scatter(*sess.run(w_posterior_samples).T, alpha=.6)
ax1.set_xlabel('$w_1$')
ax1.set_ylabel('$w_2$')
ax1.set_xlim(w_min, w_max)
ax1.set_ylim(w_min, w_max)
plt.show()
This is the second part in a series of notes on my exploration of the recently released Google QuickDraw dataset, using the concurrently released SketchRNN model.
In the previous note, we set up our development environment, downloaded a subset of the data along with some pre-trained models, and developed some utilities for visualizing the data in the notebook. We retain most of that code here and omit the expository code and markdown cells.
The QuickDraw dataset is curated from the millions of drawings contributed by over 15 million people around the world who participated in the "Quick, Draw!" A.I. Experiment, in which they were given the challenge of drawing objects belonging to a particular class (such as "cat") in under 20 seconds.
SketchRNN is a very impressive generative model that was trained to produce vector drawings using this dataset. It was of particular interest to me because it cleverly combines many of the latest tools and techniques recently developed in machine learning, such as Variational Autoencoders, HyperLSTMs (a HyperNetwork for LSTM), Autoregressive models, Layer Normalization, Recurrent Dropout, the Adam optimizer, and others.
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import tensorflow as tf
from matplotlib.animation import FuncAnimation
from matplotlib.path import Path
from matplotlib import rc
from six.moves import map
from magenta.models.sketch_rnn.sketch_rnn_train import \
(load_env,
load_checkpoint,
reset_graph,
download_pretrained_models,
PRETRAINED_MODELS_URL)
from magenta.models.sketch_rnn.model import Model, sample
from magenta.models.sketch_rnn.utils import (lerp,
slerp,
get_bounds,
to_big_strokes,
to_normal_strokes)
# For inline display of animation
# equivalent to rcParams['animation.html'] = 'html5'
rc('animation', html='html5')
# set numpy output to something sensible
np.set_printoptions(precision=8,
edgeitems=6,
linewidth=200,
suppress=True)
tf.logging.info("TensorFlow Version: {}".format(tf.__version__))
DATA_DIR = ('http://github.com/hardmaru/sketch-rnn-datasets/'
'raw/master/aaron_sheep/')
MODELS_ROOT_DIR = '/tmp/sketch_rnn/models'
DATA_DIR
PRETRAINED_MODELS_URL
download_pretrained_models(
models_root_dir=MODELS_ROOT_DIR,
pretrained_models_url=PRETRAINED_MODELS_URL)
For now, we look at the layer-normalized model trained on the aaron_sheep dataset.
MODEL_DIR = MODELS_ROOT_DIR + '/aaron_sheep/layer_norm'
(train_set,
valid_set,
test_set,
hps_model,
eval_hps_model,
sample_hps_model) = load_env(DATA_DIR, MODEL_DIR)
class SketchPath(Path):

    def __init__(self, data, factor=.2, *args, **kwargs):
        vertices = np.cumsum(data[::, :-1], axis=0) / factor
        codes = np.roll(self.to_code(data[::, -1].astype(int)),
                        shift=1)
        codes[0] = Path.MOVETO
        super(SketchPath, self).__init__(vertices,
                                         codes,
                                         *args,
                                         **kwargs)

    @staticmethod
    def to_code(cmd):
        # if cmd == 0, the code is LINETO
        # if cmd == 1, the code is MOVETO (which is LINETO - 1)
        return Path.LINETO - cmd
def draw(sketch_data, factor=.2, pad=(10, 10), ax=None):
    if ax is None:
        ax = plt.gca()
    x_pad, y_pad = pad
    x_pad //= 2
    y_pad //= 2
    x_min, x_max, y_min, y_max = get_bounds(data=sketch_data,
                                            factor=factor)
    ax.set_xlim(x_min-x_pad, x_max+x_pad)
    ax.set_ylim(y_max+y_pad, y_min-y_pad)
    sketch = SketchPath(sketch_data)
    patch = patches.PathPatch(sketch, facecolor='none')
    ax.add_patch(patch)
Everything up to here has more or less been copied straight from the previous notebook. Now we load the pre-trained SketchRNN model and use it to begin our exploration of the test dataset.
# construct the sketch-rnn model here:
reset_graph()
model = Model(hps_model)
eval_model = Model(eval_hps_model, reuse=True)
sample_model = Model(sample_hps_model, reuse=True)
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
# loads the weights from checkpoint into our model
load_checkpoint(sess=sess, checkpoint_path=MODEL_DIR)
The helper functions for encoding a sketch to some latent code $z$ and then decoding it back to a sketch were provided in the original notebook. I just made some minor syntactic changes and removed the behaviour of plotting as a side-effect.
def encode(input_strokes):
    strokes = to_big_strokes(input_strokes).tolist()
    strokes.insert(0, [0, 0, 1, 0, 0])
    seq_len = [len(input_strokes)]
    z = sess.run(eval_model.batch_z,
                 feed_dict={
                     eval_model.input_data: [strokes],
                     eval_model.sequence_lengths: seq_len})[0]
    return z
def decode(z_input=None, temperature=.1, factor=.2):
    z = None
    if z_input is not None:
        z = [z_input]
    sample_strokes, m = sample(
        sess,
        sample_model,
        seq_len=eval_model.hps.max_seq_len,
        temperature=temperature, z=z)
    return to_normal_strokes(sample_strokes)
Now we get a random sample from the test set:
sketch = test_set.random_sample()
fig, ax = plt.subplots(figsize=(3, 3),
subplot_kw=dict(xticks=[],
yticks=[],
frame_on=False))
draw(sketch, ax=ax)
plt.show()
We project it into the 128-dimensional latent space using the pre-trained encoder:
z = encode(sketch)
z.shape
Now we can reconstruct the original sketch from the learned latent representation using the pre-trained decoder, with temperature $\tau=0.6$. The temperature parameter controls the level of randomness in the samples generated by the model: as $\tau \to 0$ sampling becomes deterministic, yielding the most likely point of the predicted probability density. See pg. 7 of the original paper for further discussion of the effects the temperature parameter has on the sampling process.
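SketchRNN samples each pen offset from a mixture of bivariate Gaussians and each pen state from a categorical distribution, and $\tau$ sharpens or flattens those distributions. A minimal sketch of the idea for a categorical distribution (illustrative only; not the model's exact parameterization):
def sample_categorical(logits, tau=1.0, rng=np.random):
    """Sample an index from softmax(logits / tau)."""
    scaled = logits / tau
    p = np.exp(scaled - scaled.max())  # numerically stable softmax
    p /= p.sum()
    return rng.choice(len(p), p=p)
# tau -> 0 approaches the argmax; tau = 1 recovers the model distribution
[sample_categorical(np.log([.6, .3, .1]), tau=t) for t in (.1, 1.)]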
sketch_reconstructed = decode(z, temperature=.6)
sketch_reconstructed.shape
fig, ax = plt.subplots(figsize=(3, 3),
subplot_kw=dict(xticks=[],
yticks=[],
frame_on=False))
draw(sketch_reconstructed, ax=ax)
plt.show()
The grid of drawings below shows samples of the reconstructed drawing at various settings of the temperature parameter. The first column is the original drawing; each of the remaining columns contains 5 samples of the reconstruction, with $\tau$ increasing from 0.1 to 0.9.
fig, ax_arr = plt.subplots(nrows=5,
ncols=10,
figsize=(8, 4),
subplot_kw=dict(xticks=[],
yticks=[],
frame_on=False))
fig.tight_layout()
for row_num, ax_row in enumerate(ax_arr):
    for col_num, ax in enumerate(ax_row):
        if not col_num:
            draw(sketch, ax=ax)
            xlabel = 'original'
        else:
            t = col_num / 10.
            draw(decode(z, temperature=t), ax=ax)
            xlabel = r'$\tau={}$'.format(t)
        if row_num+1 == len(ax_arr):
            ax.set_xlabel(xlabel)
plt.show()
At the lowest temperature setting, $\tau=0.1$, the samples consistently share a similar appearance (they all look like vertical strokes emanating from a fluffy cloud), yet they are also consistently dissimilar to the original sketch. In this sense, the samples from the model seem to exhibit high bias and low variance. As we increase the variance in the samples by increasing $\tau$, we start to find some samples that resemble our original sketch. But when we increase $\tau$ a little too much, beyond say 0.8, the samples become too random again.
Humans typically write and, by extension, draw from left to right, top to bottom. Here, I wanted to animate the process of the original sketch being drawn alongside the decoder's reconstruction of the sketch to compare stroke patterns, typical stroke lengths, etc.
fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1, figsize=(6, 3),
subplot_kw=dict(xticks=[],
yticks=[]))
fig.tight_layout()
x_pad, y_pad = 10, 10
x_pad //= 2
y_pad //= 2
(x_min_1,
x_max_1,
y_min_1,
y_max_1) = get_bounds(data=sketch, factor=.2)
(x_min_2,
x_max_2,
y_min_2,
y_max_2) = get_bounds(data=sketch_reconstructed, factor=.2)
x_min = np.minimum(x_min_1, x_min_2)
y_min = np.minimum(y_min_1, y_min_2)
x_max = np.maximum(x_max_1, x_max_2)
y_max = np.maximum(y_max_1, y_max_2)
ax1.set_xlim(x_min-x_pad, x_max+x_pad)
ax1.set_ylim(y_max+y_pad, y_min-y_pad)
ax1.set_xlabel('Original')
ax2.set_xlim(x_min-x_pad, x_max+x_pad)
ax2.set_ylim(y_max+y_pad, y_min-y_pad)
ax2.set_xlabel('Reconstruction')
def animate(i):
    original = SketchPath(sketch[:i+1])
    reconstructed = SketchPath(sketch_reconstructed[:i+1])
    patch1 = ax1.add_patch(patches.PathPatch(original,
                                             facecolor='none'))
    patch2 = ax2.add_patch(patches.PathPatch(reconstructed,
                                             facecolor='none'))
    return patch1, patch2
frames = np.maximum(sketch.shape[0],
sketch_reconstructed.shape[0])
frames
FuncAnimation(fig,
animate,
frames=frames-1,
interval=15,
repeat_delay=1000*3,
blit=True)
Unfortunately, the strokes that make up a sketch have been simplified with the Ramer–Douglas–Peucker algorithm, a simple polyline-simplification procedure. This means the strokes aren't quite the same as those the human originally used to construct the sketch. Moreover, the timing of each stroke is also important to understanding patterns in how humans draw quick sketches. While timestamp data is provided in the full QuickDraw dataset, it is not preserved in the modified version of the dataset used by SketchRNN.
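For reference, a minimal sketch of the Ramer–Douglas–Peucker idea (illustrative only, not the exact routine used to preprocess the dataset; `epsilon` is the distance tolerance in the same units as the points):
def rdp(points, epsilon):
    """Minimal Ramer-Douglas-Peucker polyline simplification."""
    points = np.asarray(points, dtype=float)
    start, end = points[0], points[-1]
    chord = end - start
    norm = np.linalg.norm(chord)
    if norm == 0.:
        dists = np.linalg.norm(points - start, axis=1)
    else:
        # perpendicular distance of each point to the start-end chord
        dists = np.abs(chord[0] * (points[:, 1] - start[1]) -
                       chord[1] * (points[:, 0] - start[0])) / norm
    idx = np.argmax(dists)
    if dists[idx] > epsilon:
        # keep the farthest point and recurse on both halves
        left = rdp(points[:idx+1], epsilon)
        right = rdp(points[idx:], epsilon)
        return np.vstack((left[:-1], right))
    return np.vstack((start, end))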